from __future__ import division

import math


class Statistics(object):
    def ttest_ind(self, *args):
        """ Two-sample t-test over the selected columns.

        The columns chosen by ``args`` are sliced out and cleaned, and each
        resulting column is handed to ``scipy.stats.ttest_ind`` as a plain
        list.

        Returns whatever ``scipy.stats.ttest_ind`` returns.

        Raises ValueError when more than two selectors are given.
        """
        if len(args) > 2:
            raise ValueError('ttest is 2-way')

        from scipy.stats import ttest_ind as scipy_ttest_ind

        samples = []
        for column in self.slice(*args).clean().columns():
            samples.append(list(column))
        return scipy_ttest_ind(*samples)

    def regression(self, *indexes):
        """ Run ``stats.multivar.regression`` over the selected columns.

        The columns chosen by ``indexes`` are sliced out and cleaned.  The
        first cleaned column is passed as the leading argument and every
        remaining column as an additional positional argument, each as a
        plain list.

        Returns the result of ``regression``, or None when the slice is not
        more than one column wide.
        """
        selection = self.slice(*indexes)
        if selection.width() <= 1:
            return None

        # Imported only once a fit is actually going to run.
        from stats.multivar import regression

        # Built as a real list: map() is a lazy iterator on Python 3 and
        # cannot be indexed or sliced.
        cols = [list(col) for col in selection.clean().columns()]
        return regression(cols[0], *cols[1:])

    def _summarize(self, operation, *indexes):
        """ Apply ``operation`` to each selected column after cleaning it.

        ``indexes`` are first passed through ``self._check_indexes`` and the
        result is used to fetch columns via ``self.columns``.  Every column
        is cleaned and converted to a list before ``operation`` sees it.

        Returns a single result when ``self.columns`` hands back one column
        (anything that is not a list), otherwise a tuple with one result per
        column.
        """
        checked = self._check_indexes(*indexes)
        selected = self.columns(*checked)

        # A non-list return means exactly one column was selected.
        if not isinstance(selected, list):
            return operation(list(selected.clean()))

        results = [operation(list(column.clean())) for column in selected]
        return tuple(results)

    def cdf(self, index):
        """ Plot of the cumulative frequency table for one column.

        The table comes from ``self.cumfreq(index)`` and is wrapped in a
        ``presentation.plot.Plot``.
        """
        from presentation.plot import Plot

        table = self.cumfreq(index)
        return Plot(table)

    def x_cdf(self, index):
        """ Plot of an empirical CDF for one column.

        The column is evaluated with ``scikits.statsmodels``' ``ECDF`` on a
        ``numpy.linspace`` grid (default number of points) spanning the
        column's minimum to maximum.  The (x, y) pairs are stored in a new
        table whose columns are the source column's name and 'CDF', and that
        table is wrapped in a ``presentation.plot.Plot``.
        """
        import scikits.statsmodels as sm

        resolved = self._check_indexes(index)[0]

        # NOTE(review): unlike cumfreq(), the column is not cleaned here --
        # confirm whether that is intended.
        observations = list(self.columns(resolved))
        empirical = sm.tools.ECDF(observations)

        from numpy import linspace
        grid = linspace(min(observations), max(observations))
        heights = empirical(grid)

        table = self.__class__(self.column_names[resolved], 'CDF')
        table.extend(zip(grid, heights))

        from presentation.plot import Plot

        return Plot(table)

    def cumfreq(self, index):
        """ Cumulative relative frequency table for one column.

        The cleaned column is binned by ``scipy.stats.cumfreq`` with its
        default arguments.  A new table with columns 'n' and 'CDF' receives
        one row per bin: the bin position multiplied by the bin width, and
        the cumulative count divided by the number of cleaned values.
        """
        from scipy.stats import cumfreq

        sample = list(self.columns(index).clean())
        binned = cumfreq(sample)
        cumulative_counts = binned[0]
        bin_width = binned[2]

        table = self.__class__('n', 'CDF')
        for position, count in enumerate(cumulative_counts):
            # NOTE(review): x starts at 0 rather than at the lower limit
            # reported in binned[1] -- confirm this offset is intended.
            table.add(position * bin_width, count / len(sample))
        return table

    def skew(self, *indexes):
        """ Skewness per selected column, computed on cleaned values.

        Uses ``scipy.stats.skew`` with its default arguments.  Returns one
        value for a single column, otherwise a tuple of values.
        """
        import scipy.stats

        return self._summarize(scipy.stats.skew, *indexes)

    def kurtosis(self, *indexes):
        """ Kurtosis per selected column, computed on cleaned values.

        Uses ``scipy.stats.kurtosis`` with its default arguments.  Returns
        one value for a single column, otherwise a tuple of values.
        """
        import scipy.stats

        return self._summarize(scipy.stats.kurtosis, *indexes)

    def stddev(self, *indexes):
        """ Standard deviation per selected column, computed on cleaned values.

        Uses ``numpy.std`` with its default arguments.  Returns one value
        for a single column, otherwise a tuple of values.
        """
        import numpy

        return self._summarize(numpy.std, *indexes)

    def mean(self, *indexes):
        """ Arithmetic mean per selected column, computed on cleaned values.

        Uses ``numpy.mean``; a NaN result is reported as None instead.
        Returns one value for a single column, otherwise a tuple of values.
        """
        import numpy

        def _mean_or_none(values):
            average = numpy.mean(values)
            # NaN means there is no usable mean; surface that as None.
            return None if math.isnan(average) else average

        return self._summarize(_mean_or_none, *indexes)

    def histogram(self, index):
        """ Bucketed histogram of one column -- not implemented.

        Always raises NotImplementedError; ``index`` is accepted only to
        keep the intended signature.
        """
        # The earlier draft that followed this raise could never run.  It
        # also added each value to its bucket instead of counting it, so it
        # was dropped rather than kept as dead code.
        raise NotImplementedError()
